Over 370000 used cars scraped with Scrapy from Ebay-Kleinanzeigen.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Any results you write to the current directory are saved as output.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
import math
import scipy.stats as statsdataset_path = "autos.csv" # Replace with the actual path to your dataset
df = pd.read_csv(dataset_path, encoding='latin-1')# Display the first few rows of the dataset
print("Head of the dataset:")
print(df.head())
# Check the dimensions of the dataset
print("Dimensions:")
print(df.shape)
# Display the column names
print("Column names:")
print(df.columns)Head of the dataset:
dateCrawled name seller offerType \
0 2016-03-24 11:52:17 Golf_3_1.6 privat Angebot
1 2016-03-24 10:58:45 A5_Sportback_2.7_Tdi privat Angebot
2 2016-03-14 12:52:21 Jeep_Grand_Cherokee_"Overland" privat Angebot
3 2016-03-17 16:54:04 GOLF_4_1_4__3TÃRER privat Angebot
4 2016-03-31 17:25:20 Skoda_Fabia_1.4_TDI_PD_Classic privat Angebot
price abtest vehicleType yearOfRegistration gearbox powerPS model \
0 480 test NaN 1993 manuell 0 golf
1 18300 test coupe 2011 manuell 190 NaN
2 9800 test suv 2004 automatik 163 grand
3 1500 test kleinwagen 2001 manuell 75 golf
4 3600 test kleinwagen 2008 manuell 69 fabia
kilometer monthOfRegistration fuelType brand notRepairedDamage \
0 150000 0 benzin volkswagen NaN
1 125000 5 diesel audi ja
2 125000 8 diesel jeep NaN
3 150000 6 benzin volkswagen nein
4 90000 7 diesel skoda nein
dateCreated nrOfPictures postalCode lastSeen
0 2016-03-24 00:00:00 0 70435 2016-04-07 03:16:57
1 2016-03-24 00:00:00 0 66954 2016-04-07 01:46:50
2 2016-03-14 00:00:00 0 90480 2016-04-05 12:47:46
3 2016-03-17 00:00:00 0 91074 2016-03-17 17:40:17
4 2016-03-31 00:00:00 0 60437 2016-04-06 10:17:21
Dimensions:
(371528, 20)
Column names:
Index(['dateCrawled', 'name', 'seller', 'offerType', 'price', 'abtest',
'vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS', 'model',
'kilometer', 'monthOfRegistration', 'fuelType', 'brand',
'notRepairedDamage', 'dateCreated', 'nrOfPictures', 'postalCode',
'lastSeen'],
dtype='object')
# Get the summary statistics of the numerical columns
print("Summary statistics:")
df.describe()Summary statistics:
| price | yearOfRegistration | powerPS | kilometer | monthOfRegistration | nrOfPictures | postalCode | |
|---|---|---|---|---|---|---|---|
| count | 3.715280e+05 | 371528.000000 | 371528.000000 | 371528.000000 | 371528.000000 | 371528.0 | 371528.00000 |
| mean | 1.729514e+04 | 2004.577997 | 115.549477 | 125618.688228 | 5.734445 | 0.0 | 50820.66764 |
| std | 3.587954e+06 | 92.866598 | 192.139578 | 40112.337051 | 3.712412 | 0.0 | 25799.08247 |
| min | 0.000000e+00 | 1000.000000 | 0.000000 | 5000.000000 | 0.000000 | 0.0 | 1067.00000 |
| 25% | 1.150000e+03 | 1999.000000 | 70.000000 | 125000.000000 | 3.000000 | 0.0 | 30459.00000 |
| 50% | 2.950000e+03 | 2003.000000 | 105.000000 | 150000.000000 | 6.000000 | 0.0 | 49610.00000 |
| 75% | 7.200000e+03 | 2008.000000 | 150.000000 | 150000.000000 | 9.000000 | 0.0 | 71546.00000 |
| max | 2.147484e+09 | 9999.000000 | 20000.000000 | 150000.000000 | 12.000000 | 0.0 | 99998.00000 |
# Check the data types of each column
print("Data types:")
print(df.dtypes) Data types:
dateCrawled object
name object
seller object
offerType object
price int64
abtest object
vehicleType object
yearOfRegistration int64
gearbox object
powerPS int64
model object
kilometer int64
monthOfRegistration int64
fuelType object
brand object
notRepairedDamage object
dateCreated object
nrOfPictures int64
postalCode int64
lastSeen object
dtype: object
Clean the dataset by handling missing values, removing irrelevant columns, and converting data types if needed.
# Check for missing values
print("Missing values:")
df.isnull().sum()Missing values:
dateCrawled 0
name 0
seller 0
offerType 0
price 0
abtest 0
vehicleType 37869
yearOfRegistration 0
gearbox 20209
powerPS 0
model 20484
kilometer 0
monthOfRegistration 0
fuelType 33386
brand 0
notRepairedDamage 72060
dateCreated 0
nrOfPictures 0
postalCode 0
lastSeen 0
dtype: int64
# Calculate the percentage of null values in each column
null_percentage = (df.isnull().sum() / len(df)) * 100
print(null_percentage)dateCrawled 0.000000
name 0.000000
seller 0.000000
offerType 0.000000
price 0.000000
abtest 0.000000
vehicleType 10.192771
yearOfRegistration 0.000000
gearbox 5.439429
powerPS 0.000000
model 5.513447
kilometer 0.000000
monthOfRegistration 0.000000
fuelType 8.986133
brand 0.000000
notRepairedDamage 19.395577
dateCreated 0.000000
nrOfPictures 0.000000
postalCode 0.000000
lastSeen 0.000000
dtype: float64
# Convert the date column to datetime format
date_column = "dateCrawled" # Replace with the actual name of your date column
df[date_column] = pd.to_datetime(df[date_column])# Create separate date and time columns
df['date'] = df[date_column].dt.date
df['time'] = df[date_column].dt.time
# change datatype from object to date time
df['date'] = pd.to_datetime(df['date'])
df['time'] = pd.to_datetime(df['time'],format='%H:%M:%S').dt.strftime('%H:%M:%S')# Change the data type of column 'A' to integer
df['postalCode'] = df['postalCode'].astype(object)# Remove null values in a specific column about 10% of the data
column_with_null = ['vehicleType','model','fuelType'] # Replace with the actual column name
df_without_nulls = df.dropna(subset= column_with_null)
df_without_nulls.isnull().sum()dateCrawled 0
name 0
seller 0
offerType 0
price 0
abtest 0
vehicleType 0
yearOfRegistration 0
gearbox 7258
powerPS 0
model 0
kilometer 0
monthOfRegistration 0
fuelType 0
brand 0
notRepairedDamage 41748
dateCreated 0
nrOfPictures 0
postalCode 0
lastSeen 0
date 0
time 0
dtype: int64
# gearbox
# Replace null values with "NA" in a specific column for no information collected
column_of_interest = 'gearbox' # Replace with the actual column name
df_without_nulls[column_of_interest].fillna("NA", inplace=True)/var/folders/pf/1lj5fzks0wl1x9ydm5s588q00000gn/T/ipykernel_4297/1978238027.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# notRepairedDamage : if the car has a damage which is not repaired yet
# Replace null values with "NA" in a specific column for no information collected
column_of_interest = 'notRepairedDamage' # Replace with the actual column name
df_without_nulls[column_of_interest].fillna("NA", inplace=True)/var/folders/pf/1lj5fzks0wl1x9ydm5s588q00000gn/T/ipykernel_4297/3712268988.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Count of repaired Damage
df_without_nulls['notRepairedDamage'].value_counts()nein 236725
NA 41748
ja 28649
Name: notRepairedDamage, dtype: int64
df_without_nulls.isnull().sum()dateCrawled 0
name 0
seller 0
offerType 0
price 0
abtest 0
vehicleType 0
yearOfRegistration 0
gearbox 0
powerPS 0
model 0
kilometer 0
monthOfRegistration 0
fuelType 0
brand 0
notRepairedDamage 0
dateCreated 0
nrOfPictures 0
postalCode 0
lastSeen 0
date 0
time 0
dtype: int64
# df_without_nulls
# # Replace invalid values with NaN
# df.loc[df["yearOfRegistration"] == 1111, "yearOfRegistration"] = np.nan
# Convert yearOfRegistration to datetime format
df_without_nulls["yearOfRegistration"] = pd.to_datetime(df_without_nulls["yearOfRegistration"], format="%Y")/var/folders/pf/1lj5fzks0wl1x9ydm5s588q00000gn/T/ipykernel_4297/2823635449.py:6: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Drop irrelevant columns
# df_without_nulls = df_without_nulls['date', 'time', 'name', 'seller', 'offerType', 'price', 'abtest',
# 'vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS', 'model',
# 'kilometer', 'monthOfRegistration', 'fuelType', 'brand',
# 'notRepairedDamage', 'dateCreated', 'nrOfPictures', 'postalCode' ]
df_without_nulls = df_without_nulls.drop(['dateCrawled', 'lastSeen' ], axis=1)df_without_nulls = df_without_nulls[['date', 'time', 'name', 'seller', 'offerType', 'price', 'abtest',
'vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS', 'model',
'kilometer', 'monthOfRegistration', 'fuelType', 'brand',
'notRepairedDamage', 'dateCreated', 'nrOfPictures', 'postalCode' ]]
df_without_nulls| date | time | name | seller | offerType | price | abtest | vehicleType | yearOfRegistration | gearbox | powerPS | model | kilometer | monthOfRegistration | fuelType | brand | notRepairedDamage | dateCreated | nrOfPictures | postalCode | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 2016-03-14 | 12:52:21 | Jeep_Grand_Cherokee_"Overland" | privat | Angebot | 9800 | test | suv | 2004-01-01 | automatik | 163 | grand | 125000 | 8 | diesel | jeep | NA | 2016-03-14 00:00:00 | 0 | 90480 |
| 3 | 2016-03-17 | 16:54:04 | GOLF_4_1_4__3TÃRER | privat | Angebot | 1500 | test | kleinwagen | 2001-01-01 | manuell | 75 | golf | 150000 | 6 | benzin | volkswagen | nein | 2016-03-17 00:00:00 | 0 | 91074 |
| 4 | 2016-03-31 | 17:25:20 | Skoda_Fabia_1.4_TDI_PD_Classic | privat | Angebot | 3600 | test | kleinwagen | 2008-01-01 | manuell | 69 | fabia | 90000 | 7 | diesel | skoda | nein | 2016-03-31 00:00:00 | 0 | 60437 |
| 5 | 2016-04-04 | 17:36:23 | BMW_316i___e36_Limousine___Bastlerfahrzeug__Ex... | privat | Angebot | 650 | test | limousine | 1995-01-01 | manuell | 102 | 3er | 150000 | 10 | benzin | bmw | ja | 2016-04-04 00:00:00 | 0 | 33775 |
| 6 | 2016-04-01 | 20:48:51 | Peugeot_206_CC_110_Platinum | privat | Angebot | 2200 | test | cabrio | 2004-01-01 | manuell | 109 | 2_reihe | 150000 | 8 | benzin | peugeot | nein | 2016-04-01 00:00:00 | 0 | 67112 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 371521 | 2016-03-27 | 20:36:20 | Opel_Zafira_1.6_Elegance_TÃV_12/16 | privat | Angebot | 1150 | control | bus | 2000-01-01 | manuell | 0 | zafira | 150000 | 3 | benzin | opel | nein | 2016-03-27 00:00:00 | 0 | 26624 |
| 371524 | 2016-03-05 | 19:56:21 | Smart_smart_leistungssteigerung_100ps | privat | Angebot | 1199 | test | cabrio | 2000-01-01 | automatik | 101 | fortwo | 125000 | 3 | benzin | smart | nein | 2016-03-05 00:00:00 | 0 | 26135 |
| 371525 | 2016-03-19 | 18:57:12 | Volkswagen_Multivan_T4_TDI_7DC_UY2 | privat | Angebot | 9200 | test | bus | 1996-01-01 | manuell | 102 | transporter | 150000 | 3 | diesel | volkswagen | nein | 2016-03-19 00:00:00 | 0 | 87439 |
| 371526 | 2016-03-20 | 19:41:08 | VW_Golf_Kombi_1_9l_TDI | privat | Angebot | 3400 | test | kombi | 2002-01-01 | manuell | 100 | golf | 150000 | 6 | diesel | volkswagen | NA | 2016-03-20 00:00:00 | 0 | 40764 |
| 371527 | 2016-03-07 | 19:39:19 | BMW_M135i_vollausgestattet_NP_52.720____Euro | privat | Angebot | 28990 | control | limousine | 2013-01-01 | manuell | 320 | m_reihe | 50000 | 8 | benzin | bmw | nein | 2016-03-07 00:00:00 | 0 | 73326 |
307122 rows × 20 columns
from translate import Translator
import pandas as pd
# Function to translate a list of unique words from German to English
def translate_list(unique_words):
translator = Translator(to_lang='en', from_lang='de')
translations = [translator.translate(word) for word in unique_words]
return translations
# Define the unique words in German
seller = ['privat', 'gewerblich']
offerType = ['Angebot', 'Gesuch']
vehicleType = ['suv', 'kleinwagen', 'limousine', 'cabrio', 'bus', 'kombi', 'coupe', 'andere']
gearbox = ['automatik', 'manuell', 'NA']
fuelType = ['diesel', 'benzin', 'lpg', 'andere', 'hybrid', 'cng', 'elektro']
notRepairedDamage = ['NA', 'nein', 'ja']
# Translate the unique words from German to English
translated_seller = translate_list(seller)
translated_offerType = translate_list(offerType)
translated_vehicleType = translate_list(vehicleType)
translated_gearbox = translate_list(gearbox)
translated_fuelType = translate_list(fuelType)
translated_notRepairedDamage = translate_list(notRepairedDamage)
# Create the translation dictionaries
seller_dict = dict(zip(seller, translated_seller))
offerType_dict = dict(zip(offerType, translated_offerType))
vehicleType_dict = dict(zip(vehicleType, translated_vehicleType))
gearbox_dict = dict(zip(gearbox, translated_gearbox))
fuelType_dict = dict(zip(fuelType, translated_fuelType))
notRepairedDamage_dict = dict(zip(notRepairedDamage, translated_notRepairedDamage))
df = df_without_nulls
# Replace German words with English using the translation dictionaries
df['seller'] = df['seller'].map(seller_dict)
df['offerType'] = df['offerType'].map(offerType_dict)
df['vehicleType'] = df['vehicleType'].map(vehicleType_dict)
df['gearbox'] = df['gearbox'].map(gearbox_dict)
df['fuelType'] = df['fuelType'].map(fuelType_dict)
df['notRepairedDamage'] = df['notRepairedDamage'].map(notRepairedDamage_dict)
# Print the updated DataFrame
df| date | time | name | seller | offerType | price | abtest | vehicleType | yearOfRegistration | gearbox | powerPS | model | kilometer | monthOfRegistration | fuelType | brand | notRepairedDamage | dateCreated | nrOfPictures | postalCode | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 2016-03-14 | 12:52:21 | Jeep_Grand_Cherokee_"Overland" | private | Offer | 9800 | test | SUV | 2004-01-01 | Automatic | 163 | grand | 125000 | 8 | Diesel | jeep | NA | 2016-03-14 00:00:00 | 0 | 90480 |
| 3 | 2016-03-17 | 16:54:04 | GOLF_4_1_4__3TÃRER | private | Offer | 1500 | test | Small car | 2001-01-01 | Manual | 75 | golf | 150000 | 6 | Gasoline | volkswagen | No | 2016-03-17 00:00:00 | 0 | 91074 |
| 4 | 2016-03-31 | 17:25:20 | Skoda_Fabia_1.4_TDI_PD_Classic | private | Offer | 3600 | test | Small car | 2008-01-01 | Manual | 69 | fabia | 90000 | 7 | Diesel | skoda | No | 2016-03-31 00:00:00 | 0 | 60437 |
| 5 | 2016-04-04 | 17:36:23 | BMW_316i___e36_Limousine___Bastlerfahrzeug__Ex... | private | Offer | 650 | test | Limousine | 1995-01-01 | Manual | 102 | 3er | 150000 | 10 | Gasoline | bmw | Yes | 2016-04-04 00:00:00 | 0 | 33775 |
| 6 | 2016-04-01 | 20:48:51 | Peugeot_206_CC_110_Platinum | private | Offer | 2200 | test | Convertible | 2004-01-01 | Manual | 109 | 2_reihe | 150000 | 8 | Gasoline | peugeot | No | 2016-04-01 00:00:00 | 0 | 67112 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 371521 | 2016-03-27 | 20:36:20 | Opel_Zafira_1.6_Elegance_TÃV_12/16 | private | Offer | 1150 | control | Bus | 2000-01-01 | Manual | 0 | zafira | 150000 | 3 | Gasoline | opel | No | 2016-03-27 00:00:00 | 0 | 26624 |
| 371524 | 2016-03-05 | 19:56:21 | Smart_smart_leistungssteigerung_100ps | private | Offer | 1199 | test | Convertible | 2000-01-01 | Automatic | 101 | fortwo | 125000 | 3 | Gasoline | smart | No | 2016-03-05 00:00:00 | 0 | 26135 |
| 371525 | 2016-03-19 | 18:57:12 | Volkswagen_Multivan_T4_TDI_7DC_UY2 | private | Offer | 9200 | test | Bus | 1996-01-01 | Manual | 102 | transporter | 150000 | 3 | Diesel | volkswagen | No | 2016-03-19 00:00:00 | 0 | 87439 |
| 371526 | 2016-03-20 | 19:41:08 | VW_Golf_Kombi_1_9l_TDI | private | Offer | 3400 | test | Combi | 2002-01-01 | Manual | 100 | golf | 150000 | 6 | Diesel | volkswagen | NA | 2016-03-20 00:00:00 | 0 | 40764 |
| 371527 | 2016-03-07 | 19:39:19 | BMW_M135i_vollausgestattet_NP_52.720____Euro | private | Offer | 28990 | control | Limousine | 2013-01-01 | Manual | 320 | m_reihe | 50000 | 8 | Gasoline | bmw | No | 2016-03-07 00:00:00 | 0 | 73326 |
307122 rows × 20 columns
Calculate summary statistics and gain insights into the distribution of numerical variables.
df["age"] = df["date"].dt.year - df["yearOfRegistration"].dt.year# Get the summary statistics of numerical columns
print("Summary statistics:")
print(df.describe())Summary statistics:
price powerPS kilometer monthOfRegistration \
count 3.071220e+05 307122.000000 307122.000000 307122.000000
mean 9.404320e+03 121.801932 125391.180052 6.067527
std 5.192426e+05 171.919845 39347.308065 3.551136
min 0.000000e+00 0.000000 5000.000000 0.000000
25% 1.399000e+03 75.000000 100000.000000 3.000000
50% 3.450000e+03 110.000000 150000.000000 6.000000
75% 7.980000e+03 150.000000 150000.000000 9.000000
max 1.000000e+08 20000.000000 150000.000000 12.000000
nrOfPictures age
count 307122.0 307122.000000
mean 0.0 13.096294
std 0.0 6.597030
min 0.0 -2.000000
25% 0.0 9.000000
50% 0.0 13.000000
75% 0.0 17.000000
max 0.0 106.000000
remove outliers from the "price" column in your DataFrame,
you can use statistical methods such as Z-score or interquartile range (IQR)
to identify and filter out the outlier values.
# Calculate the IQR
Q1 = df["price"].quantile(0.25)
Q3 = df["price"].quantile(0.75)
IQR = Q3 - Q1
# Define the lower and upper bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Filter the DataFrame to exclude outliers
df_filtered = df[(df["price"] >= lower_bound) & (df["price"] <= upper_bound)]# save filtered dataset
filename = 'filteredData.csv'
df_filtered.to_csv(filename, header = True, index=False)plt.figure(figsize=(8, 6))
sns.histplot(df_filtered["price"], bins=50) # Adjust the number of bins as needed
plt.title("Distribution of Price")
plt.xlabel("Price")
plt.ylabel("Count")
plt.show()<Figure size 800x600 with 0 Axes>
<Axes: xlabel='price', ylabel='Count'>
Text(0.5, 1.0, 'Distribution of Price')
Text(0.5, 0, 'Price')
Text(0, 0.5, 'Count')

As per the analysis, there is a 0.55 correlation between YearOfRegistration and Price of Car sales on Ebay
def calculate_correlation_matrix(data):
# Filter out non-numerical columns
numeric_columns = data.select_dtypes(include='number')
# Calculate the correlation matrix
correlation_matrix = numeric_columns.corr()
return correlation_matrix
def correlation_matrix_to_table(correlation_matrix):
# Convert the correlation matrix to a DataFrame
correlation_table = pd.DataFrame(correlation_matrix)
# Reset the index and rename the columns
correlation_table = correlation_table.reset_index().rename(columns={'index': 'Variable'})
return correlation_table
# Calculate the correlation matrix
correlation_matrix = calculate_correlation_matrix(df_filtered)
correlation_matrix_to_table(correlation_matrix)
# correlation_matrix = data.corr()
print("Correlation Matrix:")
correlation_matrix| Variable | price | powerPS | kilometer | monthOfRegistration | nrOfPictures | age | |
|---|---|---|---|---|---|---|---|
| 0 | price | 1.000000 | 0.160853 | -0.358620 | 0.063203 | NaN | -0.545362 |
| 1 | powerPS | 0.160853 | 1.000000 | 0.027274 | 0.019554 | NaN | -0.057651 |
| 2 | kilometer | -0.358620 | 0.027274 | 1.000000 | -0.007415 | NaN | 0.301899 |
| 3 | monthOfRegistration | 0.063203 | 0.019554 | -0.007415 | 1.000000 | NaN | -0.062286 |
| 4 | nrOfPictures | NaN | NaN | NaN | NaN | NaN | NaN |
| 5 | age | -0.545362 | -0.057651 | 0.301899 | -0.062286 | NaN | 1.000000 |
Correlation Matrix:
| price | powerPS | kilometer | monthOfRegistration | nrOfPictures | age | |
|---|---|---|---|---|---|---|
| price | 1.000000 | 0.160853 | -0.358620 | 0.063203 | NaN | -0.545362 |
| powerPS | 0.160853 | 1.000000 | 0.027274 | 0.019554 | NaN | -0.057651 |
| kilometer | -0.358620 | 0.027274 | 1.000000 | -0.007415 | NaN | 0.301899 |
| monthOfRegistration | 0.063203 | 0.019554 | -0.007415 | 1.000000 | NaN | -0.062286 |
| nrOfPictures | NaN | NaN | NaN | NaN | NaN | NaN |
| age | -0.545362 | -0.057651 | 0.301899 | -0.062286 | NaN | 1.000000 |
# Heatmap of correlations
print("Correlation Heatmap:")
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
# Generate scatter plot matrix
pd.plotting.scatter_matrix(df_filtered, figsize=(10, 10))
# Show the descriptive statistics table
print(stats)
# Show the generated plots
plt.show()Correlation Heatmap:
<Axes: >
/Users/admin/anaconda3/lib/python3.10/site-packages/pandas/plotting/_matplotlib/misc.py:101: UserWarning:
Attempting to set identical low and high xlims makes transformation singular; automatically expanding.
/Users/admin/anaconda3/lib/python3.10/site-packages/pandas/plotting/_matplotlib/misc.py:102: UserWarning:
Attempting to set identical low and high ylims makes transformation singular; automatically expanding.
/Users/admin/anaconda3/lib/python3.10/site-packages/pandas/plotting/_matplotlib/misc.py:92: UserWarning:
Attempting to set identical low and high xlims makes transformation singular; automatically expanding.
array([[<Axes: xlabel='price', ylabel='price'>,
<Axes: xlabel='powerPS', ylabel='price'>,
<Axes: xlabel='kilometer', ylabel='price'>,
<Axes: xlabel='monthOfRegistration', ylabel='price'>,
<Axes: xlabel='nrOfPictures', ylabel='price'>,
<Axes: xlabel='age', ylabel='price'>],
[<Axes: xlabel='price', ylabel='powerPS'>,
<Axes: xlabel='powerPS', ylabel='powerPS'>,
<Axes: xlabel='kilometer', ylabel='powerPS'>,
<Axes: xlabel='monthOfRegistration', ylabel='powerPS'>,
<Axes: xlabel='nrOfPictures', ylabel='powerPS'>,
<Axes: xlabel='age', ylabel='powerPS'>],
[<Axes: xlabel='price', ylabel='kilometer'>,
<Axes: xlabel='powerPS', ylabel='kilometer'>,
<Axes: xlabel='kilometer', ylabel='kilometer'>,
<Axes: xlabel='monthOfRegistration', ylabel='kilometer'>,
<Axes: xlabel='nrOfPictures', ylabel='kilometer'>,
<Axes: xlabel='age', ylabel='kilometer'>],
[<Axes: xlabel='price', ylabel='monthOfRegistration'>,
<Axes: xlabel='powerPS', ylabel='monthOfRegistration'>,
<Axes: xlabel='kilometer', ylabel='monthOfRegistration'>,
<Axes: xlabel='monthOfRegistration', ylabel='monthOfRegistration'>,
<Axes: xlabel='nrOfPictures', ylabel='monthOfRegistration'>,
<Axes: xlabel='age', ylabel='monthOfRegistration'>],
[<Axes: xlabel='price', ylabel='nrOfPictures'>,
<Axes: xlabel='powerPS', ylabel='nrOfPictures'>,
<Axes: xlabel='kilometer', ylabel='nrOfPictures'>,
<Axes: xlabel='monthOfRegistration', ylabel='nrOfPictures'>,
<Axes: xlabel='nrOfPictures', ylabel='nrOfPictures'>,
<Axes: xlabel='age', ylabel='nrOfPictures'>],
[<Axes: xlabel='price', ylabel='age'>,
<Axes: xlabel='powerPS', ylabel='age'>,
<Axes: xlabel='kilometer', ylabel='age'>,
<Axes: xlabel='monthOfRegistration', ylabel='age'>,
<Axes: xlabel='nrOfPictures', ylabel='age'>,
<Axes: xlabel='age', ylabel='age'>]], dtype=object)
<module 'scipy.stats' from '/Users/admin/anaconda3/lib/python3.10/site-packages/scipy/stats/__init__.py'>


# correlation btn price and age of car (No. of years )# Create scatter plot
plt.figure(figsize=(8, 6))
sns.scatterplot(x=df_filtered["age"], y=df_filtered["price"])
plt.title("Age of Car vs. Price")
plt.xlabel("Age")
plt.ylabel("Price")
plt.show()<Figure size 800x600 with 0 Axes>
<Axes: xlabel='age', ylabel='price'>
Text(0.5, 1.0, 'Age of Car vs. Price')
Text(0.5, 0, 'Age')
Text(0, 0.5, 'Price')

# Analyze the distribution of a price vs age of car
plt.figure(figsize=(8, 6))
sns.histplot(df_filtered["age"])
plt.title("Distribution of Price")
plt.xlabel("age")
plt.ylabel("price")
plt.show()<Figure size 800x600 with 0 Axes>
<Axes: xlabel='age', ylabel='Count'>
Text(0.5, 1.0, 'Distribution of Price')
Text(0.5, 0, 'age')
Text(0, 0.5, 'price')

## TOP 10 MODELS
# Group the data by model and calculate the average price for each model
model_prices = df_filtered.groupby("model")["price"].mean().reset_index()
# Sort the data by price in descending order
model_prices = model_prices.sort_values("price", ascending=False)
# Select the top 10 models
top_10_models = model_prices.head(10)
# Set the figure size
plt.figure(figsize=(12, 8))
# Create the bar chart
plt.bar(top_10_models["model"], top_10_models["price"], color="b")
# Customize the chart
plt.title("Average Price for Top 10 Models")
plt.xlabel("Model")
plt.ylabel("Price")
plt.xticks(rotation=90) # Rotate x-axis labels for better readability
# Show the chart
plt.show()<Figure size 1200x800 with 0 Axes>
<BarContainer object of 10 artists>
Text(0.5, 1.0, 'Average Price for Top 10 Models')
Text(0.5, 0, 'Model')
Text(0, 0.5, 'Price')
([0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
[Text(0, 0, 'q7'),
Text(1, 0, 'glk'),
Text(2, 0, 'a5'),
Text(3, 0, 'q5'),
Text(4, 0, 'range_rover_sport'),
Text(5, 0, 'cc'),
Text(6, 0, 'a1'),
Text(7, 0, 'tiguan'),
Text(8, 0, 'yeti'),
Text(9, 0, 'kuga')])

## TOP 10 Brands
# Group the data by model and calculate the average price for each model
brand_prices = df_filtered.groupby("brand")["price"].mean().reset_index()
# Sort the data by price in descending order
brand_prices = brand_prices.sort_values("price", ascending=False)
# Select the top 10 models
top_10_brands = brand_prices.head(10)
# Set the figure size
plt.figure(figsize=(12, 8))
# Create the bar chart
plt.bar(top_10_brands["brand"], top_10_brands["price"], color="b")
# Customize the chart
plt.title("Average Price for Top 10 Brands")
plt.xlabel("Brands")
plt.ylabel("Price")
plt.xticks(rotation=90) # Rotate x-axis labels for better readability
# Show the chart
plt.show()<Figure size 1200x800 with 0 Axes>
<BarContainer object of 10 artists>
Text(0.5, 1.0, 'Average Price for Top 10 Brands')
Text(0.5, 0, 'Brands')
Text(0, 0.5, 'Price')
([0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
[Text(0, 0, 'porsche'),
Text(1, 0, 'mini'),
Text(2, 0, 'land_rover'),
Text(3, 0, 'jeep'),
Text(4, 0, 'jaguar'),
Text(5, 0, 'audi'),
Text(6, 0, 'bmw'),
Text(7, 0, 'skoda'),
Text(8, 0, 'dacia'),
Text(9, 0, 'mercedes_benz')])

import pandas as pd
import plotly.graph_objects as go
# Calculate the frequency counts for brands and models
top_10_brands = df_filtered["brand"].value_counts().nlargest(10).index
top_10_models = df_filtered["model"].value_counts().nlargest(10).index
# Filter the data for the top 10 brands
df_top_10_brands = df_filtered[df_filtered["brand"].isin(top_10_brands)]
# Calculate the frequency counts for models within each brand
top_10_models_per_brand = df_top_10_brands.groupby("brand")["model"].value_counts().groupby(level=0).nlargest(10)
# Create a hierarchical list of labels for the pie chart
labels = []
parents = []
values = []
# Add top 10 brand labels
for brand in top_10_brands:
labels.append(brand)
parents.append("")
values.append(top_10_models_per_brand[brand].sum())
# Add top 10 model labels under each brand
for brand in top_10_brands:
for model in top_10_models_per_brand[brand].index:
labels.append(model)
parents.append(brand)
values.append(top_10_models_per_brand[brand][model])
# Create the pie chart figure using plotly
fig = go.Figure(go.Sunburst(
labels=labels,
parents=parents,
values=values,
))
# Update the layout of the chart
fig.update_layout(
title="Top 10 Brands and Top 10 Models by Count",
sunburstcolorway=["#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52"],
width=800,
height=600
)
# Show the interactive pie chart
# fig.show()df = df_filteredimport pandas as pd
import plotly.express as px
# Filter and select the top 10 brands
top_10_brands = df["brand"].value_counts().nlargest(10).index
df_top_10 = df[df["brand"].isin(top_10_brands)]
# Create a treemap chart using plotly express
fig = px.treemap(df_top_10, path=['brand', 'model'], values='price')
# Configure the layout of the chart
fig.update_layout(
title="Price Distribution by Top 10 Brands and Models",
margin=dict(t=30, l=0, r=0, b=0),
)
# Convert the date column to datetime format
df['date'] = pd.to_datetime(df['date'])
# Group the data by date and count the number of occurrences
date_count = df.groupby('date').size().reset_index(name='count')
# Create a line plot using plotly express
fig = px.line(date_count, x='date', y='count', title='Trend of Cars Posted')
# Configure the layout of the chart
fig.update_layout(
xaxis=dict(title='Date'),
yaxis=dict(title='Count'),
)/var/folders/pf/1lj5fzks0wl1x9ydm5s588q00000gn/T/ipykernel_4297/2298839269.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
import pandas as pd
import plotly.express as px
# Create a treemap chart using plotly express
fig = px.treemap(df, path=['seller', 'offerType', 'gearbox', 'fuelType', 'notRepairedDamage'],
color='gearbox')
# Configure the layout of the chart
fig.update_layout(
title="Relationship between Seller, Offer Type, Gearbox, Fuel Type, and Damage Status",
margin=dict(t=30, l=0, r=0, b=0),
)
# Number of vehicle types
import pandas as pd
import plotly.express as px
# Create a word cloud graph using Plotly Express
fig = px.histogram(df, x="vehicleType", title="Vehicle Type by Count", width=800, height=500)
# Configure the graph as a word cloud
fig.update_traces(marker=dict(color='skyblue'), selector=dict(type='bar'))
fig.update_layout(barmode='stack', xaxis=dict(categoryorder='total descending'))
# MOST POPULAR CAR MODELS
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import plotly.graph_objects as go
# Calculate the count of each vehicle type
vehicle_type_counts = df['model'].value_counts()
# Generate the word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(vehicle_type_counts)
# Display the word cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Vehicle Model Word Cloud')
<Figure size 1000x500 with 0 Axes>
<matplotlib.image.AxesImage at 0x7f7ee614de70>
(-0.5, 799.5, 399.5, -0.5)
Text(0.5, 1.0, 'Vehicle Model Word Cloud')

# Count unique values of a categorical variable
brand_counts = df["brand"].value_counts()
# Sort the counts in descending order
brand_counts_sorted = brand_counts.sort_values(ascending=False)
# Create a bar plot of a categorical variable
plt.figure(figsize=(10, 6))
sns.countplot(x="brand", data=df, order=brand_counts_sorted.index)
plt.title("Distribution of Car Brands")
plt.xlabel("Brand")
plt.ylabel("Count")
plt.xticks(rotation=90) # Rotate the x-axis labels by 90 degrees
plt.show()
<Figure size 1000x600 with 0 Axes>
<Axes: xlabel='brand', ylabel='count'>
Text(0.5, 1.0, 'Distribution of Car Brands')
Text(0.5, 0, 'Brand')
Text(0, 0.5, 'Count')
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
34, 35, 36, 37, 38]),
[Text(0, 0, 'volkswagen'),
Text(1, 0, 'opel'),
Text(2, 0, 'bmw'),
Text(3, 0, 'mercedes_benz'),
Text(4, 0, 'audi'),
Text(5, 0, 'ford'),
Text(6, 0, 'renault'),
Text(7, 0, 'peugeot'),
Text(8, 0, 'fiat'),
Text(9, 0, 'seat'),
Text(10, 0, 'skoda'),
Text(11, 0, 'mazda'),
Text(12, 0, 'smart'),
Text(13, 0, 'citroen'),
Text(14, 0, 'nissan'),
Text(15, 0, 'toyota'),
Text(16, 0, 'hyundai'),
Text(17, 0, 'volvo'),
Text(18, 0, 'mini'),
Text(19, 0, 'mitsubishi'),
Text(20, 0, 'honda'),
Text(21, 0, 'kia'),
Text(22, 0, 'suzuki'),
Text(23, 0, 'alfa_romeo'),
Text(24, 0, 'chevrolet'),
Text(25, 0, 'chrysler'),
Text(26, 0, 'dacia'),
Text(27, 0, 'daihatsu'),
Text(28, 0, 'subaru'),
Text(29, 0, 'jeep'),
Text(30, 0, 'porsche'),
Text(31, 0, 'land_rover'),
Text(32, 0, 'saab'),
Text(33, 0, 'jaguar'),
Text(34, 0, 'daewoo'),
Text(35, 0, 'lancia'),
Text(36, 0, 'rover'),
Text(37, 0, 'trabant'),
Text(38, 0, 'lada')])

# Count unique values of a categorical variable
model_counts = df["model"].value_counts()
# Sort the counts in descending order
model_counts_sorted = model_counts.sort_values(ascending=False)
# Select the top 20 values
model_counts_top20 = model_counts_sorted[:20]
# Create a bar plot of the top 20 categorical variable values
plt.figure(figsize=(10, 6))
sns.countplot(x="model", data=df, order=model_counts_top20.index)
plt.title("Distribution of Car models (Top 20)")
plt.xlabel("model")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.show()<Figure size 1000x600 with 0 Axes>
<Axes: xlabel='model', ylabel='count'>
Text(0.5, 1.0, 'Distribution of Car models (Top 20)')
Text(0.5, 0, 'model')
Text(0, 0.5, 'Count')
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19]),
[Text(0, 0, 'golf'),
Text(1, 0, 'andere'),
Text(2, 0, '3er'),
Text(3, 0, 'polo'),
Text(4, 0, 'corsa'),
Text(5, 0, 'astra'),
Text(6, 0, 'passat'),
Text(7, 0, 'a4'),
Text(8, 0, 'c_klasse'),
Text(9, 0, '5er'),
Text(10, 0, 'e_klasse'),
Text(11, 0, 'a3'),
Text(12, 0, 'focus'),
Text(13, 0, 'a6'),
Text(14, 0, 'fiesta'),
Text(15, 0, '2_reihe'),
Text(16, 0, 'transporter'),
Text(17, 0, 'twingo'),
Text(18, 0, 'fortwo'),
Text(19, 0, 'vectra')])

# Calculate the mean price for each model
model_prices = df.groupby("model")["price"].mean()
# Sort the mean prices in descending order
model_prices_sorted = model_prices.sort_values(ascending=False)
# Select the top 20 models by price
top_20_models = model_prices_sorted[:20]
# Create a bar plot of the top 20 models by price
plt.figure(figsize=(10, 6))
sns.barplot(x=top_20_models.index, y=top_20_models.values)
plt.title("Top 20 Car Models by Price")
plt.xlabel("Model")
plt.ylabel("Average Price")
plt.xticks(rotation=90)
plt.show()<Figure size 1000x600 with 0 Axes>
<Axes: xlabel='model'>
Text(0.5, 1.0, 'Top 20 Car Models by Price')
Text(0.5, 0, 'Model')
Text(0, 0.5, 'Average Price')
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19]),
[Text(0, 0, 'q7'),
Text(1, 0, 'glk'),
Text(2, 0, 'a5'),
Text(3, 0, 'q5'),
Text(4, 0, 'range_rover_sport'),
Text(5, 0, 'cc'),
Text(6, 0, 'a1'),
Text(7, 0, 'tiguan'),
Text(8, 0, 'yeti'),
Text(9, 0, 'kuga'),
Text(10, 0, 'serie_3'),
Text(11, 0, 'v60'),
Text(12, 0, 'range_rover_evoque'),
Text(13, 0, 'boxster'),
Text(14, 0, 'cayenne'),
Text(15, 0, 'qashqai'),
Text(16, 0, 'b_max'),
Text(17, 0, 'cx_reihe'),
Text(18, 0, 'sl'),
Text(19, 0, 'defender')])

# Calculate the mean price for each brand
brand_prices = df.groupby("brand")["price"].mean()
# Sort the mean prices in descending order
brand_prices_sorted = brand_prices.sort_values(ascending=False)
# Select the top 20 brands by price
top_20_brands = brand_prices_sorted[:20]
# Create a bar plot of the top 20 brands by price
plt.figure(figsize=(10, 6))
sns.barplot(x=top_20_brands.index, y=top_20_brands.values)
plt.title("Top 20 Car brands by Price")
plt.xlabel("brand")
plt.ylabel("Average Price")
plt.xticks(rotation=90)
plt.show()<Figure size 1000x600 with 0 Axes>
<Axes: xlabel='brand'>
Text(0.5, 1.0, 'Top 20 Car brands by Price')
Text(0.5, 0, 'brand')
Text(0, 0.5, 'Average Price')
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19]),
[Text(0, 0, 'porsche'),
Text(1, 0, 'mini'),
Text(2, 0, 'land_rover'),
Text(3, 0, 'jeep'),
Text(4, 0, 'jaguar'),
Text(5, 0, 'audi'),
Text(6, 0, 'bmw'),
Text(7, 0, 'skoda'),
Text(8, 0, 'dacia'),
Text(9, 0, 'mercedes_benz'),
Text(10, 0, 'chevrolet'),
Text(11, 0, 'hyundai'),
Text(12, 0, 'kia'),
Text(13, 0, 'toyota'),
Text(14, 0, 'volkswagen'),
Text(15, 0, 'volvo'),
Text(16, 0, 'nissan'),
Text(17, 0, 'seat'),
Text(18, 0, 'suzuki'),
Text(19, 0, 'honda')])

import plotly.express as px
# Count unique values of the vehicleType variable
vehicle_type_counts = df["vehicleType"].value_counts()
# Create a pie chart of the vehicleType variable
fig = px.pie(vehicle_type_counts, values=vehicle_type_counts.values, names=vehicle_type_counts.index)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(title="Distribution of Vehicle Types by Count")
import plotly.express as px
# Calculate the average price for each vehicle type
avg_price_by_type = df.groupby("vehicleType")["price"].mean().reset_index()
# Create a pie chart of the vehicleType distribution by average price
fig = px.pie(avg_price_by_type, values="price", names="vehicleType")
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(title="Distribution of Vehicle Types by Average Price")
fig.show()import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
# Convert 'date' and 'dateCreated' columns to datetime
df['date'] = pd.to_datetime(df['date'])
df['dateCreated'] = pd.to_datetime(df['dateCreated'])
# Extract relevant date components if needed (e.g., year, month)
df['yearOfRegistration'] = pd.to_datetime(df['yearOfRegistration'], format='%Y', errors='coerce').dt.year
# Drop columns that are not required for modeling
df = df.drop(['date', 'time', 'name', 'seller', 'offerType', 'abtest', 'dateCreated', 'nrOfPictures', 'postalCode'], axis=1)
# Convert categorical variables to numerical using label encoding
label_encoder = LabelEncoder()
categorical_cols = ['vehicleType', 'gearbox', 'model', 'fuelType', 'brand', 'notRepairedDamage']
for col in categorical_cols:
df[col] = label_encoder.fit_transform(df[col].astype(str))/var/folders/pf/1lj5fzks0wl1x9ydm5s588q00000gn/T/ipykernel_4297/2980117876.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/var/folders/pf/1lj5fzks0wl1x9ydm5s588q00000gn/T/ipykernel_4297/2980117876.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/var/folders/pf/1lj5fzks0wl1x9ydm5s588q00000gn/T/ipykernel_4297/2980117876.py:6: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X = df.drop('price', axis=1)
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)# Initialize the models
linear_reg = LinearRegression()
random_forest = RandomForestRegressor()
xgb_reg = XGBRegressor()
# Train the models
linear_reg.fit(X_train, y_train)
random_forest.fit(X_train, y_train)
xgb_reg.fit(X_train, y_train)LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearRegression()
RandomForestRegressor()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestRegressor()
XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=None, ...)# Make predictions
linear_reg_preds = linear_reg.predict(X_test)
random_forest_preds = random_forest.predict(X_test)
xgb_reg_preds = xgb_reg.predict(X_test)
# Calculate mean squared error (MSE)
linear_reg_mse = mean_squared_error(y_test, linear_reg_preds)
random_forest_mse = mean_squared_error(y_test, random_forest_preds)
xgb_reg_mse = mean_squared_error(y_test, xgb_reg_preds)
# Print the MSE for each model
print("Linear Regression MSE:", linear_reg_mse)
print("Random Forest MSE:", random_forest_mse)
print("XGBoost MSE:", xgb_reg_mse)
# Select the model with the lowest MSE
best_model = min(linear_reg_mse, random_forest_mse, xgb_reg_mse)
if best_model == linear_reg_mse:
selected_model = linear_reg
model_name = "Linear Regression"
elif best_model == random_forest_mse:
selected_model = random_forest
model_name = "Random Forest"
else:
selected_model = xgb_reg
model_name = "XGBoost"
print("Best Model:", model_name)Linear Regression MSE: 9761443.63885572
Random Forest MSE: 2386370.046739888
XGBoost MSE: 2441435.7550129006
Best Model: Random Forest
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# Define the sample data
sample_data = {
'vehicleType': ['limousine'],
'yearOfRegistration': [2010],
'gearbox': ['automatik'],
'powerPS': [150],
'model': ['Audi A6'],
'kilometer': [100000],
'monthOfRegistration': [6],
'fuelType': ['benzin'],
'brand': ['Audi'],
'notRepairedDamage': ['nein'],
'age': [11]
}
# Create a DataFrame from the sample data
sample_df = pd.DataFrame(sample_data)
# Convert categorical variables to numerical using label encoding
label_encoder = LabelEncoder()
categorical_cols = ['vehicleType', 'gearbox', 'model', 'fuelType', 'brand', 'notRepairedDamage']
for col in categorical_cols:
sample_df[col] = label_encoder.fit_transform(sample_df[col].astype(str))
# Use the best model for price prediction
predicted_price = selected_model.predict(sample_df)
print("Predicted Price:", predicted_price[0])Predicted Price: 7746.22